'''
The purpose of this script is to aggregate the raw tweets at different levels.

It also creates lagged values of the aggregated variables.

Designed to work on Python 3.5.
'''
##########################
##
##	GLOBALS
##
##########################
import os
import pandas as pd
import numpy as np
from scipy import stats  # For follower percentile distribution

#os.chdir('<path/to/Replication/>')

##########################
##
##	DATA
##
##########################
###### FROM BOTOMETER
# Load the merged classifier output.  Rows without a bounding box are dropped: per the
# original author they do not have created_at either, so they cannot be placed in time.
data = pd.read_csv('./Data/02_processedData/c2_DonghyeonAlexmerged_classifiers_shortSpain.csv')
data = data.dropna(subset=['place.bounding_box.coordinates'])

# Helper columns used by the aggregations further down.
data['tweets'] = 1  # Constant, so summing it counts tweets per unit of aggregation
data['followers'] = data['user.followers_count'] + 1  # Shifted by one so a weight can never be 0
data['day'] = data['local_time_geocoord'].map(lambda stamp: str(stamp)[0:10])  # First 10 characters of the timestamp
data['hour'] = data['local_time_geocoord'].map(lambda stamp: int(stamp[11:13]))  # Characters 11-12 of the timestamp

# Some country codes come through looking like the text of a bytes literal (b'..').
# Strip the b' prefix and any remaining quote characters.  This calls str.replace on
# every value, so it only works once the NaN rows are gone.
for column in ('place.country_code', 'country'):
	data[column] = [code.replace('b\'', '').replace('\'', '') for code in data[column]]

###### FACE RESULTS
# Only the id plus the face-classifier outputs are taken from the second file.
faceColumns = ['id', 'totalFaces', 'raceWhite', 'raceIndian', 'raceLatino', 'raceMiddleEast', 'raceBlack', 'raceEastAsian', 'raceSEAsian', 'facesMale', 'facesFemale', 'faces0_2', 'faces3_9', 'faces10_19', 'faces20_29', 'faces30_39', 'faces40_49', 'faces50_59', 'faces60_69', 'faces70plus', 'entropyRace', 'entropyGender', 'entropyAge']
new_faces = pd.read_csv('./Data/02_processedData/c_DonghyeonAlexmerged_Newclassifiers_ShortSpain.csv')[faceColumns]

# Left merge so every tweet is kept whether or not it has face results.
data = data.merge(new_faces, on='id', how='left')

###### UPDATE COLUMN NAMES
# Columns present in both files come out of the merge with _x (first file) and _y (face file)
# suffixes.  The first file's entropy columns are dropped and the face file's versions take
# over the plain names.
data = data.drop(['entropyRace_x', 'entropyGender_x'], axis='columns')
data = data.rename(columns={
	'entropyRace_y': 'entropyRace',
	'entropyGender_y': 'entropyGender',
	'raceWhite_y': 'raceWhite',
	'raceBlack_y': 'raceBlack',
})

##########################
##
##	UPDATE CITY NAMES
##
##########################

#####  CYRILLIC
# Russian city names as they appear in the data.  Paired by position with fromGoogle below.
original = ['Абакан','Адлер','Асино','Ачинск','Балашиха','Балтийск','Белгород','Благовещенск','Брянск','Великий Новгород','Владивосток','Владимир','Волгоград','Воронеж','Горно-Алтайск','Екатеринбург','Железнодорожный','Зеленодольск','Иваново','Ивантеевка','Ижевск','Иркутск','Казань','Калининград','Калуга','Канск','Кашира','Кемерово','Киров','Колпино','Королев','Кострома','Котельники','Красково','Краснодар','Красноярск','Кронштадт','Курган','Курск','Лангепас','Липецк','Лосино-Петровский','Лысьва','Магадан','Минусинск','Москва','Мурманск','Назарово','Новокузнецк','Новосибирск','Одинцово','Омск','Оренбург','Отрадный','Пенза','Пермь','Псков','Пушкин','Реутов','Ростов-на-Дону','Рубцовск','Салтыковка','Самара','Санкт-Петербург','Саранск','Саратов','Серпухов','Сиверский','Смоленск','Сочи','Ставрополь','Стерлитамак','Тольятти','Томск','Тула','Тулун','Тюмень','Улан-Удэ','Ульяновск','Уфа','Хабаровск','Химки','Холмск','Челябинск','Черкесск','Черногорск','Чистополь','Чита','Шадринск','Щелково','Энгельс','Южно-Сахалинск','Якутск','Ярославль',]

# English names, same order as `original`.
# NOTE(review): ' Lipetsk' (leading space) and 'Novokuz Netsk' look like translation artifacts.
# They are left exactly as they were because other files may already key on these strings --
# confirm before correcting them.
fromGoogle = ['Abakan','Adler','Asino','Achinsk','Balashikha','Baltiysk','Belgorod','Blagoveshchensk','Bryansk','Veliky Novgorod','Vladivostok','Vladimir','Volgograd','Voronezh','Gorno-Altaisk','Yekaterinburg','Zheleznodorozhny','Zelenodolsk','Ivanovo','Ivanteevka','Izhevsk','Irkutsk','Kazan','Kaliningrad','Kaluga','Kansk','Kashira','Kemerovo','Kirov','Kolpino','Korolev','Kostroma','Kotelniki','Kraskovo','Krasnodar','Krasnoyarsk','Kronstadt','Kurgan','Kursk','Langepas',' Lipetsk','Losino-Petrovsky','Lysva','Magadan','Minusinsk','Moscow','Murmansk','Nazarovo','Novokuz Netsk','Novosibirsk','Odintsovo','Omsk','Orenburg','Otradny','Penza','Perm','Pskov','Pushkin','Reutov','Rostov-on-Don','Rubtsovsk','Saltykovka','Samara','St. Petersburg','Saransk','Saratov','Serpukhov','Siversky','Smolensk','Sochi','Stavropol','Sterlitamak','Tolyatti','Tomsk','Tula','Tulun','Tyumen','Ulan-Ude','Ulyanovsk','Ufa','Khabarovsk','Khimki','Kholmsk','Chelyabinsk' ,'Cherkessk','Chernogorsk','Chistopol','Chita','Shadrinsk','Schelkovo','Engels','Yuzhno-Sakhalinsk','Yakutsk','Yaroslavl']


# Translation dictionary
translation = dict(zip(original, fromGoogle))

# Replace Russian entries.  One dictionary-based replace touches each row once; looping over
# the rows and re-scanning the whole column for every row gives the same result far more slowly.
# Values that are not keys of the dictionary (including NaN) are left unchanged.
data['city_use'] = data['city_use'].replace(translation)


##### HONG KONG
# These places are folded into 'Central' at city level ...
original = {'Sha Tin': 'Central', 'Tsuen Wan': 'Central', 'Wan Chai': 'Central'}

# Update for Hong Kong
data['city_use'] = data['city_use'].replace(original)

# ... and into 'Central and Western' at state level.
# NOTE(review): the state spelling is 'Wanchai' while the city spelling above is 'Wan Chai';
# confirm both match what is actually in the data.
data['rg.state'] = data['rg.state'].replace({
	'Sha Tin': 'Central and Western',
	'Tsuen Wan': 'Central and Western',
	'Wanchai': 'Central and Western',
})


##### SOUTH KOREA
# These cities are treated as part of Seoul.
original = {'Bucheon-si': 'Seoul', 'Guri-si': 'Seoul', 'Kwangmyong': 'Seoul', 'Seongnam-si': 'Seoul'}

# Update for Seoul
data['city_use'] = data['city_use'].replace(original)

data['rg.state'] = data['rg.state'].replace('Gyeonggi-do', 'Seoul')


##########################
##
##	FUNCTIONS
##
##########################
def _shannonEntropy(counts):
	"""Return the base-2 Shannon entropy of one row of category totals.

	counts: one row of summed face counts, one entry per category.
	Returns NaN when the row does not sum to a positive number (no faces in that
	unit-day), otherwise -sum(p * log2(p)) over the category shares p.
	Formula from: https://stackoverflow.com/questions/49973537/shannons-entropy-on-an-array-containing-zeros
	"""
	total = np.sum(counts)
	if not total > 0:
		return np.nan

	shares = counts / total
	# A category with zero faces gives 0 * log2(0) = NaN; nansum skips it, so it contributes 0.
	with np.errstate(divide='ignore', invalid='ignore'):
		return -np.nansum(shares * np.log2(shares))


def _lagWithinUnit(frame):
	"""Return frame with a '_lag' copy of every column appended.

	frame must be indexed by the grouping keys with 'day' as the LAST index level.
	The lag is taken within each unit, i.e. within each combination of all index
	levels except the last.  Grouping on level 0 alone (country) would let the first
	day of a state or city pick up the last day of the previous state or city.
	The lag is the previous row for that unit, which is the previous day that has any
	tweets, not necessarily the previous calendar day.
	"""
	unitLevels = list(range(frame.index.nlevels - 1))
	lagged = frame.groupby(level=unitLevels).shift(1)

	return pd.merge(frame, lagged, left_index=True, right_index=True, suffixes=('', '_lag'))


def _entropyByDay(data, keys, columns, name):
	"""Return a frame with the entropy column `name` and its lag `name + '_lag'`.

	The counts in `columns` are summed within each group of `keys` first, and the
	entropy is computed on those sums (not as an average of per-image entropies).
	"""
	totals = data.groupby(keys)[columns].agg('sum')
	entropy = totals.apply(_shannonEntropy, axis=1).to_frame(name)

	return _lagWithinUnit(entropy)


def doAggregation(data, file_modifier, column_agg):
	"""Aggregate tweets to country-day, state-day and city-day and write one CSV per level.

	data: tweet-level data frame.  Must hold the grouping columns ('place.country_code',
		'rg.state', 'city_use', 'day'), every column named in column_agg, and the
		race / gender / age face-count columns used for the entropies.
	file_modifier: string inserted before '.csv' in the three output file names.
	column_agg: dictionary of column name -> aggregation, passed to groupby().agg().

	Each output holds the aggregated columns, a '_lag' version of each of them, and
	entropyRace_byDay / entropyGender_byDay / entropyAge_byDay with their lags.
	Files are written to ./Data/02_processedData/.  Returns nothing.
	"""
	# Keys go from the biggest unit to the smallest with 'day' last, so rows come out
	# sorted by day inside each unit, which is what the lag relies on.
	groupings = (
		('cityday', ['place.country_code', 'rg.state', 'city_use', 'day']),
		('stateday', ['place.country_code', 'rg.state', 'day']),
		('countryday', ['place.country_code', 'day']),
	)

	# Output column -> face-count columns whose daily sums feed that entropy.
	entropyInputs = (
		('entropyRace_byDay', ['raceWhite', 'raceIndian', 'raceLatino', 'raceMiddleEast', 'raceBlack', 'raceEastAsian', 'raceSEAsian']),
		('entropyGender_byDay', ['facesMale', 'facesFemale']),
		('entropyAge_byDay', ['faces0_2', 'faces3_9', 'faces10_19', 'faces20_29', 'faces30_39', 'faces40_49', 'faces50_59', 'faces60_69', 'faces70plus']),
	)

	# Build every table before writing anything, so a failure does not leave a partial set of files.
	tables = []
	for label, keys in groupings:
		table = _lagWithinUnit(data.groupby(keys).agg(column_agg))

		for name, columns in entropyInputs:
			table = pd.merge(table, _entropyByDay(data, keys, columns, name), left_index=True, right_index=True)

		# Reset index so the grouping keys are written as ordinary columns
		tables.append((label, table.reset_index()))

	## Save out
	for label, table in tables:
		table.to_csv('./Data/02_processedData/d_DonghyeonAlexmerged_' + label + '_withNewClassifierOutput' + file_modifier + '.csv', index=False)


def countProtesterPhotos(data, file_modifier):
	"""Count distinct users and tweets per country-day, state-day and city-day and write one CSV per level.

	data: tweet-level data frame holding 'user.id', 'tweets' and the grouping columns
		('place.country_code', 'rg.state', 'city_use', 'day').
	file_modifier: string inserted before '.csv' in the three output file names.

	Each output has 'user.id' (number of distinct ids in the unit-day), 'tweets' (sum of
	the tweets column) and a '_lag' version of both.  Files are written to
	./Data/02_processedData/.  Returns nothing.
	"""
	##	PROTESTERS, PROTEST IMAGES PER DAY
	def countUsers(group):
		# Number of distinct values among the ids of one group
		return len(set(group))

	def withLags(frame):
		# Lag within each unit: group on every index level except the last one ('day').
		# Grouping on level 0 only (country) would let the first day of a state or city
		# take its lag from the previous state or city.  The lag is the previous row for
		# the unit, i.e. the previous day that has any tweets.
		unitLevels = list(range(frame.index.nlevels - 1))
		lagged = frame.groupby(level=unitLevels).shift(1)

		return pd.merge(frame, lagged, left_index=True, right_index=True, suffixes=('', '_lag'))

	counts = {'user.id': countUsers, 'tweets': 'sum'}

	# Keys go from the biggest unit to the smallest with 'day' last, so rows are sorted by day inside each unit.
	groupings = (
		('cityday', ['place.country_code', 'rg.state', 'city_use', 'day']),
		('countryday', ['place.country_code', 'day']),
		('stateday', ['place.country_code', 'rg.state', 'day']),
	)

	# Build every table before writing anything, so a failure does not leave a partial set of files.
	tables = [(label, withLags(data.groupby(keys).agg(counts)).reset_index()) for label, keys in groupings]

	for label, table in tables:
		table.to_csv('./Data/02_processedData/d_DonghyeonAlexmerged_' + label + '_Protesters_Tweets' + file_modifier + '.csv', index=False)


##########################
##
##	AGGREGATION DICTIONARIES
##
##########################
# Below is for the merged data, have to use Donghyeon's naming convention.
# Column -> aggregation applied within each unit-day: tweets are counted, the
# protest_result.* columns are averaged, and the binary_* columns are summed.
column_agg = {
	'tweets': 'sum',
	'protest_result.children': 'mean',
	'protest_result.fire': 'mean',
	'protest_result.flag': 'mean',
	'protest_result.group_100': 'mean',
	'protest_result.group_20': 'mean',
	'protest_result.night': 'mean',
	'protest_result.photo': 'mean',
	'protest_result.police': 'mean',
	'protest_result.protest': 'mean',
	'protest_result.shouting': 'mean',
	'protest_result.sign': 'mean',
	'protest_result.violence': 'mean',
	'protest_result.protester_violence': 'mean',
	'protest_result.state_violence': 'mean',
	'binary_fire': 'sum',
	'binary_children': 'sum',
	'binary_flag': 'sum',
	'binary_group_100': 'sum',
	'binary_group_20': 'sum',
	'binary_night': 'sum',
	'binary_photo': 'sum',
	'binary_police': 'sum',
	'binary_protest': 'sum',
	'binary_shouting': 'sum',
	'binary_sign': 'sum',
	'binary_violence': 'sum',
	'binary_groupAny': 'sum',
	'binary_groupChildren': 'sum',
	'binary_ViolencePolice': 'sum',
	'binary_ViolenceFire': 'sum',
	'binary_ViolencePoliceFire': 'sum',
	'binary_protester_violence': 'sum',
	'binary_state_violence': 'sum',
}

# Face variables get both a sum and a mean; entropies only get a mean, it does not make sense to sum them.
# Every name listed here must already be a column of data.
faceVars = ['totalFaces', 'raceWhite', 'raceIndian', 'raceLatino', 'raceMiddleEast', 'raceBlack', 'raceEastAsian', 'raceSEAsian', 'facesMale', 'facesFemale', 'faces0_2', 'faces3_9', 'faces10_19', 'faces20_29', 'faces30_39', 'faces40_49', 'faces50_59', 'faces60_69', 'faces70plus', 'faces']
entropies = ['entropyRace', 'entropyGender', 'entropyAge']

# An aggregation dictionary maps each column to one function, so every variable is
# copied under a name whose suffix says which aggregation that copy receives.
for faceVar in faceVars:
	for suffix in ('_Sum', '_Mean'):
		data[faceVar + suffix] = data[faceVar]

for entropy in entropies:
	data[entropy + '_Mean'] = data[entropy]

face_agg_mean = {name + '_Mean': 'mean' for name in faceVars + entropies}
face_agg_sum = {name + '_Sum': 'sum' for name in faceVars}

# Merge all together
column_agg.update(face_agg_mean)
column_agg.update(face_agg_sum)


##########################
##
##	NOTE ABOUT AGGREGATING
##
##########################
# It would be easier to aggregate as seen here: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.DataFrameGroupBy.agg.html
# and here: https://stackoverflow.com/questions/14507794/python-pandas-how-to-flatten-a-hierarchical-index-in-columns.groupby

# But since I want slightly different aggregation functions per step of aggregation, I do not do that.

##########################
##
##	WORK
##
##########################

#########
# Make subsets
#########

#### Verified
# Keeps the rows where user.verified is False, i.e. the accounts that are NOT verified.
# Verified accounts are left out because they may behave strategically.
# NOTE(review): the variable name (and the '_verifiedAccounts' file suffix used later) reads
# as the opposite of what the subset holds.
verified = data[data['user.verified'].eq(False)]

#### Country is not inferred
# The centroid of a bounding box can fall over a body of water; keep only rows where the
# no-landmass flag is False, just to be safe.
good_bb = data[data['local_time_geocoord_no_landmass'].eq(False)]

#### Source is not desktop
# A tweet counts as desktop when its source text contains any of these client names.
these = ['Twitter Web Client', 'Hootsuite']
data['desktop'] = data['source'].map(lambda client: int(any(tag in client for tag in these)))
mobile = data[data['desktop'].eq(0)]


#### Tweet is in country language
# Languages from https://developer.twitter.com/en/docs/twitter-for-websites/twitter-for-websites-supported-languages/overview.html
# Country code is ISO 3166
# Each country maps to a set so that some countries, mainly Ukraine, can have several dominant languages.
# NOTE(review): 'PH' maps to 'tg'; confirm this is the code the data uses for the Philippines.
# 'AM' maps to an empty string and 'GA' to 'und' -- confirm these are intended.
languages = {
	'FR': {'fr'},
	'VE': {'es'},
	'TN': {'ar'},
	'PH': {'tg'},
	'BR': {'pt'},
	'TH': {'th'},
	'EC': {'es'},
	'HU': {'hu'},
	'NI': {'es'},
	'IR': {'fa'},
	'RU': {'ru'},
	'AM': {''},
	'IQ': {'ar'},
	'KR': {'ko'},
	'HK': {'en'},
	'ES': {'es'},
	'UA': {'ru', 'uk'},
	'PK': {'ur'},
	'EG': {'ar'},
	'GA': {'und'},
	'BY': {'ru'},
}

# NB: Had to add the column below, which doesn't quite feel right.  It has an obvious name so it doesn't get buried.
# Code may break later, but that will be the reminder.
data['DonghyeonEvents_tweet_lang'] = data['lang']

# 1 when the tweet language is one of the country's dominant languages, else 0.
# A country code that is missing from `languages` raises KeyError here.
data['dominant_language'] = [
	int(tweetLanguage in languages[countryCode])
	for countryCode, tweetLanguage in zip(data['place.country_code'], data['DonghyeonEvents_tweet_lang'])
]

dominant = data[data['dominant_language'].eq(1)]

#### Tweet is from between 10 a.m. and 10 p.m.
# Both ends are included, so hours 10 through 22.
time_narrow = data[data['hour'].between(10, 22)]

#### User is 25th-75th percentile popularity based on country-month.
## Will still aggregate by city, state, country day.  Popularity is computed by country-month because these are national networks.
data['yearmonth'] = data['day'].str.slice(0, 7)  # 'YYYY-MM' part of the day string

def getPercentile(data):
	"""Add a 'followerPercentile' column (0-100] to data and return the same frame.

	data: data frame with a 'user.followers_count' column; meant to be one
		country-month group.
	The percentile of a row is its average rank among the group's follower counts,
	divided by the number of counts, times 100.  Ties share their average rank.
	This is the value scipy.stats.percentileofscore gives with its default kind,
	computed once for the whole column instead of once per row.
	Rows with a missing follower count get NaN and are not counted in the divisor.
	The column is added in place and the frame that was passed in is returned.
	"""
	data['followerPercentile'] = data['user.followers_count'].rank(pct=True) * 100

	return data


# Percentile of each user's follower count within its country-month: average rank in the
# group, divided by group size, times 100 (the same number scipy's percentileofscore gives
# by default).  It is written straight onto data, so the row order, index and columns of
# data stay exactly as they were; a groupby().apply() with a function that returns the whole
# group does not guarantee that across pandas versions.
data['followerPercentile'] = data.groupby(['place.country_code', 'yearmonth'])['user.followers_count'].rank(pct=True) * 100
middlePopularity = data[(data['followerPercentile'] >= 25) & (data['followerPercentile'] <= 75)]


#### Tweet not likely from bot
## What to make the threshold?  .4, based on ``Online Human-Bot Interactions: Detection, Estimation, and Characterization". 2016, Varol, Ferrara, Davis et al.  They go a little higher, so .4 is conservative.
# Accounts that could not be scored are set to missing before converting to float.
data.loc[data['bot_cap_universal'] == 'User_no_longer_available', 'bot_cap_universal'] = np.nan
data['bot_cap_universal'] = data['bot_cap_universal'].astype(float)

# NOTE(review): a missing score is not >= .4, so unscored accounts get bot = 0 and stay in
# the no-bot subset.  Confirm that is intended.
data['bot'] = (data['bot_cap_universal'] >= .4).astype(int)

nobot = data[data['bot'] == 0]


#### Remove duplicate images
duplicates = pd.read_csv('./Data/02_processedData/c2_DonghyeonAlexmerged_classifiers_ShortSpain_dedupDetect.csv')

# Merge because added metadata
data = pd.merge(data, duplicates[['id', 'deduplicate_id']], on='id', how='left')

# Keep the first tweet for every deduplicate_id.  Tweets with no deduplicate_id (no match in
# the duplicates file) are all kept: a missing id says nothing about being a duplicate, and
# drop_duplicates would treat every missing id as the same value and keep only one such row.
isRepeat = data['deduplicate_id'].notnull() & data['deduplicate_id'].duplicated(keep='first')
noduplicates = data[~isRepeat]

#########
# Aggregations
#########
# Every subset goes through both aggregations with the same file suffix.  Pairing the data
# and the suffix once means the two calls for a subset cannot drift apart (the
# '_2575Popularity' protester counts used to be computed from time_narrow by mistake).
subsets = [
	(data, ''),  # On all data, no subsets
	(verified, '_verifiedAccounts'),  # Rows where user.verified is False
	(mobile, '_mobileTweets'),  # Tweets only from mobile devices
	(dominant, '_dominantLanguage'),  # Tweets are in country's dominant language
	(time_narrow, '_timeNarrow'),  # Tweets are from a normal time window
	(middlePopularity, '_2575Popularity'),  # Tweets are from normal user popularity
	(good_bb, '_goodBB'),  # On good bounding boxes
	(nobot, '_noBot'),  # Tweets are likely not from bot
	(noduplicates, '_noDuplicate'),  # Tweets are not duplicates
]

for subset, modifier in subsets:
	doAggregation(data=subset, file_modifier=modifier, column_agg=column_agg)
	countProtesterPhotos(data=subset, file_modifier=modifier)
